#import required libraries and pandas to read the raw data csv file to a dataframe
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Reading the raw dataset
raw_dataset = pd.read_csv('data/ConsolidatedDataV2.csv')
# The number of rows and columns in the dataset
raw_dataset.shape
(2856, 30)
# Printing the first 6 rows of the dataset
raw_dataset.head()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 2011 | 76.914 | 26.2 | 22.9 | NaN | 5.03 | 100.0 | 97.0 | 103 | ... | 4.795327 | NaN | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 99.0 | 99.0 | 9.29 |
| 1 | Albania | 2012 | 77.252 | 26.3 | 23.1 | NaN | 4.43 | 100.0 | 96.0 | 103 | ... | 5.055262 | NaN | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 99.0 | 99.0 | 9.29 |
| 2 | Albania | 2013 | 77.554 | 26.4 | 23.6 | NaN | 4.28 | 100.0 | 99.0 | 100 | ... | 5.385599 | NaN | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 99.0 | 99.0 | 8.29 |
| 3 | Algeria | 2014 | 75.878 | 25.4 | 33.9 | NaN | 0.54 | 200.0 | 99.0 | 98 | ... | 6.547214 | NaN | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 99.0 | 95.0 | 3107.90 |
| 4 | Algeria | 2015 | 76.090 | 25.5 | 33.9 | NaN | 0.55 | 200.0 | 99.0 | 96 | ... | 6.978492 | NaN | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 99.0 | 95.0 | 3208.10 |
5 rows × 30 columns
# Printing the last 6 rows of the dataset
raw_dataset.tail()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2851 | Zimbabwe | 2011 | 52.896 | 23.7 | 48.3 | 2.0 | 3.91 | 40000.0 | 98.0 | 440 | ... | 8.081738 | NaN | 93.0 | 34.3 | 842.2 | 4.6 | 94.0 | NaN | 93.0 | 1512.0 |
| 2852 | Zimbabwe | 2012 | 55.032 | 23.7 | 46.2 | 1.0 | 3.93 | 33000.0 | 98.0 | 407 | ... | 6.918353 | NaN | 95.0 | 33.1 | 826.4 | 4.5 | 97.0 | NaN | 95.0 | 1612.0 |
| 2853 | Zimbabwe | 2013 | 56.897 | 23.7 | 44.3 | NaN | 4.11 | 28000.0 | 95.0 | 383 | ... | 7.110148 | NaN | 95.0 | 31.4 | 810.2 | 4.5 | 95.0 | NaN | 95.0 | 1209.3 |
| 2854 | Zimbabwe | 2014 | 58.410 | 23.8 | 42.8 | NaN | 4.22 | 25000.0 | 99.0 | 358 | ... | 8.133524 | NaN | 91.0 | 30.8 | 804.3 | 4.4 | 91.0 | NaN | 92.0 | 1410.0 |
| 2855 | Zimbabwe | 2015 | 59.534 | 23.8 | 41.7 | 0.0 | 3.84 | 24000.0 | 90.0 | 346 | ... | 7.452066 | NaN | 87.0 | 30.7 | 800.1 | 4.3 | 87.0 | NaN | 88.0 | 1107.9 |
5 rows × 30 columns
# Count, Mean , Min and MAx values of the columns.
raw_dataset.describe()
| Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ChildMortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2856.000000 | 2567.000000 | 2856.000000 | 2822.000000 | 781.000000 | 2800.000000 | 2040.000000 | 2344.000000 | 2856.000000 | 2.856000e+03 | ... | 2458.000000 | 782.000000 | 2837.000000 | 2856.000000 | 2856.000000 | 2856.000000 | 2288.000000 | 1629.000000 | 2820.000000 | 2799.000000 |
| mean | 2008.000000 | 68.795037 | 24.562185 | 36.663466 | 83.691421 | 4.814093 | 8918.289216 | 89.593003 | 194.384804 | 6.725880e+04 | ... | 6.079373 | 66.125614 | 86.272823 | 11.992017 | 605.322024 | 1.498704 | 85.366696 | 84.666053 | 86.427660 | 8995.936395 |
| std | 4.899837 | 9.863356 | 3.390756 | 21.199494 | 301.232338 | 3.939412 | 24522.502650 | 13.897708 | 116.830456 | 2.234386e+05 | ... | 2.435629 | 9.043176 | 15.585910 | 9.989647 | 186.165083 | 1.588911 | 17.253674 | 18.385550 | 15.209758 | 43737.466927 |
| min | 2000.000000 | 39.441000 | 0.000000 | 5.800000 | 0.000000 | 0.000000 | 100.000000 | 16.000000 | 49.000000 | 1.800000e+01 | ... | 1.025159 | 57.035300 | 19.000000 | 0.100000 | 240.400000 | 0.000000 | 2.000000 | 2.000000 | 8.000000 | 0.000000 |
| 25% | 2004.000000 | 61.433500 | 23.000000 | 19.500000 | 0.000000 | 1.240000 | 200.000000 | 87.000000 | 104.000000 | 1.062500e+03 | ... | 4.152656 | 61.876000 | 82.000000 | 5.800000 | 456.575000 | 0.300000 | 81.000000 | 81.000000 | 82.000000 | 67.050000 |
| 50% | 2008.000000 | 71.710000 | 25.400000 | 30.400000 | 4.000000 | 4.015000 | 975.000000 | 95.000000 | 164.000000 | 8.016500e+03 | ... | 5.835874 | 64.249950 | 93.000000 | 10.000000 | 604.500000 | 0.800000 | 92.000000 | 92.000000 | 93.000000 | 646.000000 |
| 75% | 2012.000000 | 76.129000 | 26.400000 | 52.475000 | 38.000000 | 7.752500 | 5600.000000 | 98.000000 | 260.000000 | 5.138925e+04 | ... | 7.939192 | 67.866425 | 97.000000 | 15.000000 | 725.600000 | 2.400000 | 96.000000 | 97.000000 | 97.000000 | 3107.800000 |
| max | 2016.000000 | 84.090000 | 29.600000 | 88.400000 | 3990.000000 | 17.870000 | 290000.000000 | 99.000000 | 697.000000 | 2.025425e+06 | ... | 20.413412 | 126.126400 | 99.000000 | 116.200000 | 1317.700000 | 9.400000 | 99.000000 | 99.000000 | 99.000000 | 615058.000000 |
8 rows × 29 columns
# To understand the data types of the columns
raw_dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2856 entries, 0 to 2855 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2856 non-null object 1 Year 2856 non-null int64 2 Life expectancy 2567 non-null float64 3 BMI 2856 non-null float64 4 ChildMalnutrition 2822 non-null float64 5 Cholera 781 non-null float64 6 Alcohol 2800 non-null float64 7 HIV 2040 non-null float64 8 BCG 2344 non-null float64 9 Adult Mortality 2856 non-null int64 10 ChildMortality 2856 non-null int64 11 Population 2528 non-null float64 12 Eggs Consumption 2339 non-null float64 13 Bovine Meat 2339 non-null float64 14 Mutton & Goat meat 2339 non-null float64 15 Other Meat 2339 non-null float64 16 Pig Meat 2273 non-null float64 17 Poultry Meat 2339 non-null float64 18 Milk Consumption 2339 non-null float64 19 Fish and Seafood 2339 non-null float64 20 Medical Expenditure 2458 non-null float64 21 Retirement Age 782 non-null float64 22 Diphtheria 2837 non-null float64 23 Suicides 2856 non-null float64 24 NCD 2856 non-null float64 25 Env Pollution 2856 non-null float64 26 HepatitisB 2288 non-null float64 27 Measles 1629 non-null float64 28 Polio 2820 non-null float64 29 Tuberculosis 2799 non-null float64 dtypes: float64(26), int64(3), object(1) memory usage: 669.5+ KB
dataset = raw_dataset
## The dataset consists of “year” column which is time series data. So the datatype has been changed to datetime
dataset['Year'] = pd.to_datetime(dataset['Year'] , format='%Y', errors='ignore')
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2856 entries, 0 to 2855 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2856 non-null object 1 Year 2856 non-null datetime64[ns] 2 Life expectancy 2567 non-null float64 3 BMI 2856 non-null float64 4 ChildMalnutrition 2822 non-null float64 5 Cholera 781 non-null float64 6 Alcohol 2800 non-null float64 7 HIV 2040 non-null float64 8 BCG 2344 non-null float64 9 Adult Mortality 2856 non-null int64 10 ChildMortality 2856 non-null int64 11 Population 2528 non-null float64 12 Eggs Consumption 2339 non-null float64 13 Bovine Meat 2339 non-null float64 14 Mutton & Goat meat 2339 non-null float64 15 Other Meat 2339 non-null float64 16 Pig Meat 2273 non-null float64 17 Poultry Meat 2339 non-null float64 18 Milk Consumption 2339 non-null float64 19 Fish and Seafood 2339 non-null float64 20 Medical Expenditure 2458 non-null float64 21 Retirement Age 782 non-null float64 22 Diphtheria 2837 non-null float64 23 Suicides 2856 non-null float64 24 NCD 2856 non-null float64 25 Env Pollution 2856 non-null float64 26 HepatitisB 2288 non-null float64 27 Measles 1629 non-null float64 28 Polio 2820 non-null float64 29 Tuberculosis 2799 non-null float64 dtypes: datetime64[ns](1), float64(26), int64(2), object(1) memory usage: 669.5+ KB
# Printing the first 6 rows of the dataset
dataset.head()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 2011-01-01 | 76.914 | 26.2 | 22.9 | NaN | 5.03 | 100.0 | 97.0 | 103 | ... | 4.795327 | NaN | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 99.0 | 99.0 | 9.29 |
| 1 | Albania | 2012-01-01 | 77.252 | 26.3 | 23.1 | NaN | 4.43 | 100.0 | 96.0 | 103 | ... | 5.055262 | NaN | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 99.0 | 99.0 | 9.29 |
| 2 | Albania | 2013-01-01 | 77.554 | 26.4 | 23.6 | NaN | 4.28 | 100.0 | 99.0 | 100 | ... | 5.385599 | NaN | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 99.0 | 99.0 | 8.29 |
| 3 | Algeria | 2014-01-01 | 75.878 | 25.4 | 33.9 | NaN | 0.54 | 200.0 | 99.0 | 98 | ... | 6.547214 | NaN | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 99.0 | 95.0 | 3107.90 |
| 4 | Algeria | 2015-01-01 | 76.090 | 25.5 | 33.9 | NaN | 0.55 | 200.0 | 99.0 | 96 | ... | 6.978492 | NaN | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 99.0 | 95.0 | 3208.10 |
5 rows × 30 columns
#Total columns = 30
# Total rows = 2856
# to check the missing values in the columns and view in descending order
dataset.isnull().sum().sort_values(ascending=False)
Cholera 2075 Retirement Age 2074 Measles 1227 HIV 816 Pig Meat 583 HepatitisB 568 Eggs Consumption 517 Milk Consumption 517 Mutton & Goat meat 517 Bovine Meat 517 Other Meat 517 Fish and Seafood 517 Poultry Meat 517 BCG 512 Medical Expenditure 398 Population 328 Life expectancy 289 Tuberculosis 57 Alcohol 56 Polio 36 ChildMalnutrition 34 Diphtheria 19 Year 0 ChildMortality 0 Adult Mortality 0 Suicides 0 NCD 0 Env Pollution 0 BMI 0 Country 0 dtype: int64
# Drop the columns with more than 50% of missing data
# Cholera 2075
# Retirement Age 2074
# Measles 1227
#drop the retitrement column
dataset = dataset.drop(['Retirement Age'],axis=1)
# removing the cholera column more than 50% missing data
dataset = dataset.drop(['Cholera'],axis=1)
#drop the Measles column
dataset = dataset.drop(['Measles'],axis=1)
# Life Expectancy is the main focus of our research
# Life expectancy 289
# Imputing introduces bias which is not desirable . Hence, we delete this null rows
dataset=dataset[dataset['Life expectancy'].notna()]
# convert datatype of life expectancy to int after removing null values
dataset['Life expectancy'] = dataset['Life expectancy'].astype(np.int64)
# to check the missing values in the columns and view in descending order again after removing some columns
dataset.isnull().sum().sort_values(ascending=False)
HIV 731 HepatitisB 522 BCG 476 Pig Meat 294 Mutton & Goat meat 228 Other Meat 228 Fish and Seafood 228 Milk Consumption 228 Poultry Meat 228 Eggs Consumption 228 Bovine Meat 228 Medical Expenditure 187 Population 124 Alcohol 56 Tuberculosis 38 ChildMalnutrition 34 Polio 34 Diphtheria 17 Year 0 ChildMortality 0 Adult Mortality 0 BMI 0 Suicides 0 NCD 0 Env Pollution 0 Life expectancy 0 Country 0 dtype: int64
# Drop only the rows with missing values which are less than 10%
# Pig Meat 294
# Mutton & Goat meat 228
# Other Meat 228
# Fish and Seafood 228
# Milk Consumption 228
# Poultry Meat 228
# Eggs Consumption 228
# Bovine Meat 228
# Medical Expenditure 187
# Population 124
# Alcohol 56
# Tuberculosis 38
# ChildMalnutrition 34
# Polio 34
# Diphtheria 17
dataset=dataset[dataset['Pig Meat'].notna()]
dataset=dataset[dataset['Mutton & Goat meat'].notna()]
dataset=dataset[dataset['Other Meat'].notna()]
dataset=dataset[dataset['Fish and Seafood'].notna()]
dataset=dataset[dataset['Milk Consumption'].notna()]
dataset=dataset[dataset['Poultry Meat'].notna()]
dataset=dataset[dataset['Eggs Consumption'].notna()]
dataset=dataset[dataset['Bovine Meat'].notna()]
dataset=dataset[dataset['Medical Expenditure'].notna()]
dataset=dataset[dataset['Population'].notna()]
dataset=dataset[dataset['Alcohol'].notna()]
dataset=dataset[dataset['Tuberculosis'].notna()]
dataset=dataset[dataset['ChildMalnutrition'].notna()]
dataset=dataset[dataset['Polio'].notna()]
dataset=dataset[dataset['Diphtheria'].notna()]
# replacing the missing values with mean value
# missing data is about 25% and deletion causes too much data loss from the dataset ( rows decrease drastically)
# HIV 731
# HepatitisB 522
# BCG 476
dataset['HIV']=dataset['HIV'].fillna(value=dataset['HIV'].mean())
dataset['BCG']=dataset['BCG'].fillna(value=dataset['BCG'].mean())
dataset['HepatitisB']=dataset['HepatitisB'].fillna(value=dataset['HepatitisB'].mean())
# To ignore warnings thrown by dataprep library
import warnings
warnings.filterwarnings('ignore')
# importing the dataprep library
import dataprep
from dataprep.eda import create_report
# generating the visual using create_report method
pre_processed_report = create_report(dataset, title='Pre-Processed Dataset')
pre_processed_report
NumExpr defaulting to 8 threads.
| Number of Variables | 27 |
|---|---|
| Number of Rows | 2074 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 568.7 KB |
| Average Row Size in Memory | 280.8 B |
| Categorical | 1 |
|---|---|
| DateTime | 1 |
| Numerical | 25 |
categorical
| Distinct Count | 125 |
|---|---|
| Unique (%) | 6.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 147.5 KB |
| Mean | 7.8038 |
|---|---|
| Standard Deviation | 3.1386 |
| Median | 7 |
| Minimum | 4 |
| Maximum | 24 |
| 1st row | Albania |
|---|---|
| 2nd row | Albania |
| 3rd row | Albania |
| 4th row | Algeria |
| 5th row | Algeria |
| Count | 15929 |
|---|---|
| Lowercase Letter | 13633 |
| Space Separator | 239 |
| Uppercase Letter | 2296 |
| Dash Punctuation | 17 |
| Decimal Number | 0 |
datetime
| Distinct Count | 17 |
|---|---|
| Unique (%) | 0.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 32.4 KB |
| Minimum | 2000-01-01 00:00:00 |
| Maximum | 2016-01-01 00:00:00 |
numerical
| Distinct Count | 46 |
|---|---|
| Unique (%) | 2.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 68.7083 |
| Minimum | 39 |
| Maximum | 84 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 39 |
|---|---|
| 5-th Percentile | 49.65 |
| Q1 | 62 |
| Median | 71 |
| Q3 | 76 |
| 95-th Percentile | 81 |
| Maximum | 84 |
| Range | 45 |
| IQR | 14 |
| Mean | 68.7083 |
|---|---|
| Standard Deviation | 10.0026 |
| Variance | 100.0523 |
| Sum | 142501 |
| Skewness | -0.7547 |
| Kurtosis | -0.3761 |
| Coefficient of Variation | 0.1456 |
numerical
| Distinct Count | 94 |
|---|---|
| Unique (%) | 4.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 24.8506 |
| Minimum | 19.8 |
| Maximum | 29.1 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19.8 |
|---|---|
| 5-th Percentile | 21.4 |
| Q1 | 23.2 |
| Median | 25.5 |
| Q3 | 26.3 |
| 95-th Percentile | 27.3 |
| Maximum | 29.1 |
| Range | 9.3 |
| IQR | 3.1 |
| Mean | 24.8506 |
|---|---|
| Standard Deviation | 1.9323 |
| Variance | 3.7337 |
| Sum | 51540.2 |
| Skewness | -0.5303 |
| Kurtosis | -0.7565 |
| Coefficient of Variation | 0.07776 |
numerical
| Distinct Count | 652 |
|---|---|
| Unique (%) | 31.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 35.8725 |
| Minimum | 9.3 |
| Maximum | 88.4 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 9.3 |
|---|---|
| 5-th Percentile | 11.7 |
| Q1 | 18.8 |
| Median | 29.25 |
| Q3 | 50.5 |
| 95-th Percentile | 77.7 |
| Maximum | 88.4 |
| Range | 79.1 |
| IQR | 31.7 |
| Mean | 35.8725 |
|---|---|
| Standard Deviation | 21.6236 |
| Variance | 467.5794 |
| Sum | 74399.5 |
| Skewness | 0.7696 |
| Kurtosis | -0.6317 |
| Coefficient of Variation | 0.6028 |
numerical
| Distinct Count | 1006 |
|---|---|
| Unique (%) | 48.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 5.2056 |
| Minimum | 0 |
| Maximum | 17.87 |
| Zeros | 25 |
| Zeros (%) | 1.2% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.19 |
| Q1 | 1.68 |
| Median | 4.485 |
| Q3 | 8.1 |
| 95-th Percentile | 12.1 |
| Maximum | 17.87 |
| Range | 17.87 |
| IQR | 6.42 |
| Mean | 5.2056 |
|---|---|
| Standard Deviation | 3.9504 |
| Variance | 15.6055 |
| Sum | 10796.33 |
| Skewness | 0.5139 |
| Kurtosis | -0.6963 |
| Coefficient of Variation | 0.7589 |
numerical
| Distinct Count | 219 |
|---|---|
| Unique (%) | 10.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 9416.6428 |
| Minimum | 100 |
| Maximum | 290000 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 100 |
|---|---|
| 5-th Percentile | 100 |
| Q1 | 500 |
| Median | 3800 |
| Q3 | 9416.6428 |
| 95-th Percentile | 42350 |
| Maximum | 290000 |
| Range | 289900 |
| IQR | 8916.6428 |
| Mean | 9416.6428 |
|---|---|
| Standard Deviation | 22287.9745 |
| Variance | 4.9675e+08 |
| Sum | 1.953e+07 |
| Skewness | 6.8164 |
| Kurtosis | 62.0487 |
| Coefficient of Variation | 2.3669 |
numerical
| Distinct Count | 76 |
|---|---|
| Unique (%) | 3.7% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 89.9038 |
| Minimum | 16 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 16 |
|---|---|
| 5-th Percentile | 69 |
| Q1 | 89.9038 |
| Median | 92 |
| Q3 | 98 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 83 |
| IQR | 8.0962 |
| Mean | 89.9038 |
|---|---|
| Standard Deviation | 12.4791 |
| Variance | 155.7282 |
| Sum | 186460.5769 |
| Skewness | -3.0251 |
| Kurtosis | 11.3404 |
| Coefficient of Variation | 0.1388 |
numerical
| Distinct Count | 445 |
|---|---|
| Unique (%) | 21.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 193.2985 |
| Minimum | 49 |
| Maximum | 683 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 49 |
|---|---|
| 5-th Percentile | 62.65 |
| Q1 | 101 |
| Median | 161 |
| Q3 | 255 |
| 95-th Percentile | 445.35 |
| Maximum | 683 |
| Range | 634 |
| IQR | 154 |
| Mean | 193.2985 |
|---|---|
| Standard Deviation | 119.867 |
| Variance | 14368.0976 |
| Sum | 400901 |
| Skewness | 1.2728 |
| Kurtosis | 1.4242 |
| Coefficient of Variation | 0.6201 |
numerical
| Distinct Count | 124 |
|---|---|
| Unique (%) | 6.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 75008.4851 |
| Minimum | 18 |
| Maximum | 2.0254e+06 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 18 |
|---|---|
| 5-th Percentile | 101 |
| Q1 | 979 |
| Median | 6704 |
| Q3 | 52218 |
| 95-th Percentile | 219826 |
| Maximum | 2.0254e+06 |
| Range | 2.0254e+06 |
| IQR | 51239 |
| Mean | 75008.4851 |
|---|---|
| Standard Deviation | 254538.4035 |
| Variance | 6.479e+10 |
| Sum | 1.5557e+08 |
| Skewness | 6.1166 |
| Kurtosis | 39.4687 |
| Coefficient of Variation | 3.3935 |
numerical
| Distinct Count | 2073 |
|---|---|
| Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 4.3879e+07 |
| Minimum | 247315 |
| Maximum | 1.3787e+09 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 247315 |
|---|---|
| 5-th Percentile | 515042.65 |
| Q1 | 3.5037e+06 |
| Median | 9.4696e+06 |
| Q3 | 2.4687e+07 |
| 95-th Percentile | 1.4258e+08 |
| Maximum | 1.3787e+09 |
| Range | 1.3784e+09 |
| IQR | 2.1183e+07 |
| Mean | 4.3879e+07 |
|---|---|
| Standard Deviation | 1.6239e+08 |
| Variance | 2.6371e+16 |
| Sum | 9.1006e+10 |
| Skewness | 7.019 |
| Kurtosis | 50.2303 |
| Coefficient of Variation | 3.7009 |
numerical
| Distinct Count | 1105 |
|---|---|
| Unique (%) | 53.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.6446 |
| Minimum | 0.01 |
| Maximum | 22.35 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.01 |
|---|---|
| 5-th Percentile | 0.46 |
| Q1 | 1.75 |
| Median | 6.13 |
| Q3 | 10.18 |
| 95-th Percentile | 15.721 |
| Maximum | 22.35 |
| Range | 22.34 |
| IQR | 8.43 |
| Mean | 6.6446 |
|---|---|
| Standard Deviation | 5.0699 |
| Variance | 25.7043 |
| Sum | 13780.87 |
| Skewness | 0.5265 |
| Kurtosis | -0.6652 |
| Coefficient of Variation | 0.763 |
numerical
| Distinct Count | 1381 |
|---|---|
| Unique (%) | 66.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 11.6437 |
| Minimum | 0.26 |
| Maximum | 59.09 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.26 |
|---|---|
| 5-th Percentile | 1.5795 |
| Q1 | 4.99 |
| Median | 8.325 |
| Q3 | 17.35 |
| 95-th Percentile | 28.1235 |
| Maximum | 59.09 |
| Range | 58.83 |
| IQR | 12.36 |
| Mean | 11.6437 |
|---|---|
| Standard Deviation | 9.2928 |
| Variance | 86.3565 |
| Sum | 24149.09 |
| Skewness | 1.5618 |
| Kurtosis | 3.4582 |
| Coefficient of Variation | 0.7981 |
numerical
| Distinct Count | 726 |
|---|---|
| Unique (%) | 35.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 3.3526 |
| Minimum | 0 |
| Maximum | 55.4 |
| Zeros | 20 |
| Zeros (%) | 1.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.04 |
| Q1 | 0.6 |
| Median | 1.445 |
| Q3 | 3.49 |
| 95-th Percentile | 13.027 |
| Maximum | 55.4 |
| Range | 55.4 |
| IQR | 2.89 |
| Mean | 3.3526 |
|---|---|
| Standard Deviation | 5.7487 |
| Variance | 33.0476 |
| Sum | 6953.33 |
| Skewness | 4.3163 |
| Kurtosis | 24.5375 |
| Coefficient of Variation | 1.7147 |
numerical
| Distinct Count | 501 |
|---|---|
| Unique (%) | 24.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 1.584 |
| Minimum | 0 |
| Maximum | 22.31 |
| Zeros | 199 |
| Zeros (%) | 9.6% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0 |
| Q1 | 0.1 |
| Median | 0.71 |
| Q3 | 1.9 |
| 95-th Percentile | 6.1405 |
| Maximum | 22.31 |
| Range | 22.31 |
| IQR | 1.8 |
| Mean | 1.584 |
|---|---|
| Standard Deviation | 2.6958 |
| Variance | 7.2676 |
| Sum | 3285.3 |
| Skewness | 3.6449 |
| Kurtosis | 16.6287 |
| Coefficient of Variation | 1.7019 |
numerical
| Distinct Count | 1307 |
|---|---|
| Unique (%) | 63.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.8389 |
| Minimum | 0 |
| Maximum | 64.24 |
| Zeros | 72 |
| Zeros (%) | 3.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.01 |
| Q1 | 1.145 |
| Median | 5.61 |
| Q3 | 22.34 |
| 95-th Percentile | 42.522 |
| Maximum | 64.24 |
| Range | 64.24 |
| IQR | 21.195 |
| Mean | 12.8389 |
|---|---|
| Standard Deviation | 14.9566 |
| Variance | 223.7009 |
| Sum | 26627.93 |
| Skewness | 1.1647 |
| Kurtosis | 0.2275 |
| Coefficient of Variation | 1.1649 |
numerical
| Distinct Count | 1522 |
|---|---|
| Unique (%) | 73.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 15.5803 |
| Minimum | 0.05 |
| Maximum | 72.74 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.05 |
|---|---|
| 5-th Percentile | 0.8595 |
| Q1 | 4.0025 |
| Median | 13.63 |
| Q3 | 23.72 |
| 95-th Percentile | 38.1445 |
| Maximum | 72.74 |
| Range | 72.69 |
| IQR | 19.7175 |
| Mean | 15.5803 |
|---|---|
| Standard Deviation | 12.8611 |
| Variance | 165.4069 |
| Sum | 32313.6 |
| Skewness | 1.0476 |
| Kurtosis | 1.3842 |
| Coefficient of Variation | 0.8255 |
numerical
| Distinct Count | 1980 |
|---|---|
| Unique (%) | 95.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 116.448 |
| Minimum | 1.02 |
| Maximum | 463.91 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.02 |
|---|---|
| 5-th Percentile | 6.624 |
| Q1 | 31.9475 |
| Median | 98.2 |
| Q3 | 178.51 |
| 95-th Percentile | 291.8985 |
| Maximum | 463.91 |
| Range | 462.89 |
| IQR | 146.5625 |
| Mean | 116.448 |
|---|---|
| Standard Deviation | 94.9286 |
| Variance | 9011.434 |
| Sum | 241513.11 |
| Skewness | 0.7646 |
| Kurtosis | -0.237 |
| Coefficient of Variation | 0.8152 |
numerical
| Distinct Count | 1577 |
|---|---|
| Unique (%) | 76.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 17.395 |
| Minimum | 0.07 |
| Maximum | 191.75 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.07 |
|---|---|
| 5-th Percentile | 1.3165 |
| Q1 | 5.1825 |
| Median | 12.16 |
| Q3 | 24 |
| 95-th Percentile | 47.607 |
| Maximum | 191.75 |
| Range | 191.68 |
| IQR | 18.8175 |
| Mean | 17.395 |
|---|---|
| Standard Deviation | 19.817 |
| Variance | 392.7147 |
| Sum | 36077.21 |
| Skewness | 3.9626 |
| Kurtosis | 25.1503 |
| Coefficient of Variation | 1.1392 |
numerical
| Distinct Count | 2074 |
|---|---|
| Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.3738 |
| Minimum | 1.7014 |
| Maximum | 20.4134 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.7014 |
|---|---|
| 5-th Percentile | 2.8371 |
| Q1 | 4.4335 |
| Median | 6.1969 |
| Q3 | 8.1496 |
| 95-th Percentile | 10.3428 |
| Maximum | 20.4134 |
| Range | 18.712 |
| IQR | 3.7161 |
| Mean | 6.3738 |
|---|---|
| Standard Deviation | 2.3791 |
| Variance | 5.6603 |
| Sum | 13219.2983 |
| Skewness | 0.3744 |
| Kurtosis | 0.2049 |
| Coefficient of Variation | 0.3733 |
numerical
| Distinct Count | 78 |
|---|---|
| Unique (%) | 3.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.6432 |
| Minimum | 19 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19 |
|---|---|
| 5-th Percentile | 51 |
| Q1 | 83 |
| Median | 92 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 80 |
| IQR | 14 |
| Mean | 86.6432 |
|---|---|
| Standard Deviation | 14.9612 |
| Variance | 223.8369 |
| Sum | 179698 |
| Skewness | -1.9149 |
| Kurtosis | 3.5448 |
| Coefficient of Variation | 0.1727 |
numerical
| Distinct Count | 362 |
|---|---|
| Unique (%) | 17.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.6951 |
| Minimum | 0.1 |
| Maximum | 116.2 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.1 |
|---|---|
| 5-th Percentile | 2.9 |
| Q1 | 6 |
| Median | 10.4 |
| Q3 | 15.4 |
| 95-th Percentile | 30.27 |
| Maximum | 116.2 |
| Range | 116.1 |
| IQR | 9.4 |
| Mean | 12.6951 |
|---|---|
| Standard Deviation | 10.8287 |
| Variance | 117.2611 |
| Sum | 26329.6 |
| Skewness | 3.6664 |
| Kurtosis | 22.6907 |
| Coefficient of Variation | 0.853 |
numerical
| Distinct Count | 1796 |
|---|---|
| Unique (%) | 86.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 595.0553 |
| Minimum | 240.4 |
| Maximum | 1317.7 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 240.4 |
|---|---|
| 5-th Percentile | 328.125 |
| Q1 | 437.325 |
| Median | 588.35 |
| Q3 | 710.125 |
| 95-th Percentile | 935.445 |
| Maximum | 1317.7 |
| Range | 1077.3 |
| IQR | 272.8 |
| Mean | 595.0553 |
|---|---|
| Standard Deviation | 194.1573 |
| Variance | 37697.0431 |
| Sum | 1.2341e+06 |
| Skewness | 0.644 |
| Kurtosis | 0.357 |
| Coefficient of Variation | 0.3263 |
numerical
| Distinct Count | 83 |
|---|---|
| Unique (%) | 4.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 1.4701 |
| Minimum | 0 |
| Maximum | 9.2 |
| Zeros | 31 |
| Zeros (%) | 1.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.1 |
| Q1 | 0.3 |
| Median | 0.7 |
| Q3 | 2.5 |
| 95-th Percentile | 4.4 |
| Maximum | 9.2 |
| Range | 9.2 |
| IQR | 2.2 |
| Mean | 1.4701 |
|---|---|
| Standard Deviation | 1.5467 |
| Variance | 2.3922 |
| Sum | 3048.9 |
| Skewness | 1.469 |
| Kurtosis | 2.2657 |
| Coefficient of Variation | 1.0521 |
numerical
| Distinct Count | 90 |
|---|---|
| Unique (%) | 4.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 84.7931 |
| Minimum | 2 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 2 |
|---|---|
| 5-th Percentile | 48 |
| Q1 | 84 |
| Median | 88 |
| Q3 | 95 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 97 |
| IQR | 11 |
| Mean | 84.7931 |
|---|---|
| Standard Deviation | 15.734 |
| Variance | 247.5587 |
| Sum | 175860.8966 |
| Skewness | -2.3798 |
| Kurtosis | 6.6278 |
| Coefficient of Variation | 0.1856 |
numerical
| Distinct Count | 74 |
|---|---|
| Unique (%) | 3.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.7208 |
| Minimum | 8 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 8 |
|---|---|
| 5-th Percentile | 53 |
| Q1 | 82 |
| Median | 93 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 91 |
| IQR | 15 |
| Mean | 86.7208 |
|---|---|
| Standard Deviation | 14.6434 |
| Variance | 214.43 |
| Sum | 179859 |
| Skewness | -1.8239 |
| Kurtosis | 3.3041 |
| Coefficient of Variation | 0.1689 |
numerical
| Distinct Count | 1766 |
|---|---|
| Unique (%) | 85.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 10668.9849 |
| Minimum | 0 |
| Maximum | 615058 |
| Zeros | 1 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 8.43 |
| Q1 | 67.0125 |
| Median | 583.6 |
| Q3 | 3110.225 |
| 95-th Percentile | 49029 |
| Maximum | 615058 |
| Range | 615058 |
| IQR | 3043.2125 |
| Mean | 10668.9849 |
|---|---|
| Standard Deviation | 50450.6869 |
| Variance | 2.5453e+09 |
| Sum | 2.2127e+07 |
| Skewness | 9.2523 |
| Kurtosis | 93.7958 |
| Coefficient of Variation | 4.7287 |
#using label encoding for the categorical columns with text values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['Country'] = le.fit_transform(dataset['Country'])
dataset['Year'] = le.fit_transform(dataset['Year'])
dataset.head()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Alcohol | HIV | BCG | Adult Mortality | ChildMortality | ... | Milk Consumption | Fish and Seafood | Medical Expenditure | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 11 | 76 | 26.2 | 22.9 | 5.03 | 100.0 | 97.0 | 103 | 868 | ... | 301.27 | 5.86 | 4.795327 | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 99.0 | 9.29 |
| 1 | 1 | 12 | 77 | 26.3 | 23.1 | 4.43 | 100.0 | 96.0 | 103 | 868 | ... | 299.85 | 4.97 | 5.055262 | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 99.0 | 9.29 |
| 2 | 1 | 13 | 77 | 26.4 | 23.6 | 4.28 | 100.0 | 99.0 | 100 | 868 | ... | 303.72 | 4.87 | 5.385599 | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 99.0 | 8.29 |
| 3 | 2 | 14 | 75 | 25.4 | 33.9 | 0.54 | 200.0 | 99.0 | 98 | 60319 | ... | 151.06 | 4.40 | 6.547214 | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 95.0 | 3107.90 |
| 4 | 2 | 15 | 76 | 25.5 | 33.9 | 0.55 | 200.0 | 99.0 | 96 | 60319 | ... | 125.37 | 4.16 | 6.978492 | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 95.0 | 3208.10 |
5 rows × 27 columns
# plotting the correleation between all the features in the dataset
# to identify the strongly related related variables with LE
#Using Pearson Correlation
plt.figure(figsize=(20,15))
correlation_matrix = dataset.corr()
sns.heatmap(correlation_matrix, annot=True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm')
plt.show()
#Correlation with output variable Life expectancy
cor_target = correlation_matrix["Life expectancy"]
#Viewing highly correlated features
relevant_features = cor_target[cor_target>=-1]
relevant_features.sort_values(ascending=False)
Life expectancy 1.000000 Eggs Consumption 0.708943 Milk Consumption 0.659305 BMI 0.654637 Polio 0.649462 Diphtheria 0.641525 Pig Meat 0.591232 Poultry Meat 0.590431 Alcohol 0.450114 Bovine Meat 0.439087 Medical Expenditure 0.364535 Fish and Seafood 0.330970 HepatitisB 0.244216 BCG 0.235935 Year 0.183378 Mutton & Goat meat 0.056031 Population 0.010906 Country -0.067061 Other Meat -0.090164 Tuberculosis -0.106261 ChildMortality -0.250402 HIV -0.326044 Suicides -0.429654 NCD -0.706556 Env Pollution -0.708975 ChildMalnutrition -0.882725 Adult Mortality -0.948039 Name: Life expectancy, dtype: float64
# dropping irrelevent features from the dataset
# HepatitisB 0.244216
# BCG 0.235935
# Year 0.183378
# Mutton & Goat meat 0.056031
# Population 0.010906
# Country -0.067061
# Other Meat -0.090164
# Tuberculosis -0.106261
dataset=dataset.drop('HepatitisB',axis=1)
dataset=dataset.drop('BCG',axis=1)
dataset=dataset.drop('Year',axis=1)
dataset=dataset.drop('Mutton & Goat meat',axis=1)
dataset=dataset.drop('Population',axis=1)
dataset=dataset.drop('Country',axis=1)
dataset=dataset.drop('Other Meat',axis=1)
dataset=dataset.drop('Tuberculosis',axis=1)
dataset.head()
| Life expectancy | BMI | ChildMalnutrition | Alcohol | HIV | Adult Mortality | ChildMortality | Eggs Consumption | Bovine Meat | Pig Meat | Poultry Meat | Milk Consumption | Fish and Seafood | Medical Expenditure | Diphtheria | Suicides | NCD | Env Pollution | Polio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 76 | 26.2 | 22.9 | 5.03 | 100.0 | 103 | 868 | 7.72 | 21.24 | 11.03 | 13.41 | 301.27 | 5.86 | 4.795327 | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 |
| 1 | 77 | 26.3 | 23.1 | 4.43 | 100.0 | 103 | 868 | 12.69 | 22.40 | 11.04 | 12.76 | 299.85 | 4.97 | 5.055262 | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 |
| 2 | 77 | 26.4 | 23.6 | 4.28 | 100.0 | 100 | 868 | 12.45 | 22.50 | 10.88 | 13.23 | 303.72 | 4.87 | 5.385599 | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 |
| 3 | 75 | 25.4 | 33.9 | 0.54 | 200.0 | 98 | 60319 | 7.93 | 5.43 | 0.00 | 6.86 | 151.06 | 4.40 | 6.547214 | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 |
| 4 | 76 | 25.5 | 33.9 | 0.55 | 200.0 | 96 | 60319 | 8.65 | 5.35 | 0.00 | 6.64 | 125.37 | 4.16 | 6.978492 | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 |
# Feature Scaling
#transform the data to be on same scale using sklearn's StandardScaler()
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X = dataset.drop('Life expectancy',axis=1)
y = dataset['Life expectancy'].astype('int')
X
| BMI | ChildMalnutrition | Alcohol | HIV | Adult Mortality | ChildMortality | Eggs Consumption | Bovine Meat | Pig Meat | Poultry Meat | Milk Consumption | Fish and Seafood | Medical Expenditure | Diphtheria | Suicides | NCD | Env Pollution | Polio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26.2 | 22.9 | 5.03 | 100.0 | 103 | 868 | 7.72 | 21.24 | 11.03 | 13.41 | 301.27 | 5.86 | 4.795327 | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 |
| 1 | 26.3 | 23.1 | 4.43 | 100.0 | 103 | 868 | 12.69 | 22.40 | 11.04 | 12.76 | 299.85 | 4.97 | 5.055262 | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 |
| 2 | 26.4 | 23.6 | 4.28 | 100.0 | 100 | 868 | 12.45 | 22.50 | 10.88 | 13.23 | 303.72 | 4.87 | 5.385599 | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 |
| 3 | 25.4 | 33.9 | 0.54 | 200.0 | 98 | 60319 | 7.93 | 5.43 | 0.00 | 6.86 | 151.06 | 4.40 | 6.547214 | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 |
| 4 | 25.5 | 33.9 | 0.55 | 200.0 | 96 | 60319 | 8.65 | 5.35 | 0.00 | 6.64 | 125.37 | 4.16 | 6.978492 | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2851 | 23.7 | 48.3 | 3.91 | 40000.0 | 440 | 52218 | 1.78 | 7.66 | 2.81 | 6.69 | 34.21 | 2.99 | 8.081738 | 93.0 | 34.3 | 842.2 | 4.6 | 93.0 |
| 2852 | 23.7 | 46.2 | 3.93 | 33000.0 | 407 | 52218 | 1.78 | 7.53 | 2.88 | 6.72 | 31.07 | 2.91 | 6.918353 | 95.0 | 33.1 | 826.4 | 4.5 | 95.0 |
| 2853 | 23.7 | 44.3 | 4.11 | 28000.0 | 383 | 52218 | 1.75 | 7.37 | 2.65 | 4.97 | 31.90 | 2.82 | 7.110148 | 95.0 | 31.4 | 810.2 | 4.5 | 95.0 |
| 2854 | 23.8 | 42.8 | 4.22 | 25000.0 | 358 | 52218 | 1.93 | 7.26 | 1.84 | 4.49 | 30.54 | 3.39 | 8.133524 | 91.0 | 30.8 | 804.3 | 4.4 | 92.0 |
| 2855 | 23.8 | 41.7 | 3.84 | 24000.0 | 346 | 52218 | 1.84 | 7.25 | 1.74 | 4.67 | 27.38 | 3.82 | 7.452066 | 87.0 | 30.7 | 800.1 | 4.3 | 88.0 |
2074 rows × 18 columns
y
0 76
1 77
2 77
3 75
4 76
..
2851 52
2852 55
2853 56
2854 58
2855 59
Name: Life expectancy, Length: 2074, dtype: int32
X = scale.fit_transform(X)
X
array([[ 0.69850001, -0.60006687, -0.04445178, ..., -0.54018371,
-0.62734346, 0.83874704],
[ 0.75026479, -0.59081548, -0.19637236, ..., -0.45311988,
-0.69201418, 0.83874704],
[ 0.80202956, -0.567687 , -0.2343525 , ..., -0.5597602 ,
-0.69201418, 0.83874704],
...,
[-0.59561939, 0.38983192, -0.27739667, ..., 1.10836225,
1.95948557, 0.56552109],
[-0.54385461, 0.32044649, -0.24954456, ..., 1.07796719,
1.89481484, 0.36060162],
[-0.54385461, 0.26956384, -0.34576093, ..., 1.05633002,
1.83014412, 0.08737566]])
#splitting the data into my train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=0)
#from sklearn import utils
#print(utils.multiclass.type_of_target(y_train))
#print(utils.multiclass.type_of_target(y_train.astype('int')))
y_train = y_train.astype('int')
y_test = y_test.astype('int')
np.unique(y_train, return_counts=True)
(array([39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]),
array([ 1, 1, 1, 5, 6, 7, 7, 9, 12, 13, 12, 24, 24,
19, 19, 17, 21, 20, 33, 28, 27, 24, 27, 26, 26, 18,
24, 37, 38, 33, 46, 57, 65, 62, 71, 107, 74, 47, 36,
66, 76, 70, 72, 37, 5, 1], dtype=int64))
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train,y_train = ros.fit_resample(X, y)
np.unique(y_train, return_counts=True)
(array([39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]),
array([137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137,
137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137,
137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137,
137, 137, 137, 137, 137, 137, 137], dtype=int64))
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from math import sqrt
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
print('Gaussian Naive Bayes')
print('------------------------------')
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print('Accuracy : {}'.format(gnb.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Gaussian Naive Bayes ------------------------------ Accuracy : 0.30818619582664525 MAE : 1.63 MSE : 5.96 RMSE : 2.440956 R2_SCORE : 0.938516
# BernoulliNB Naive Bayes
from sklearn.naive_bayes import BernoulliNB
print('Bernoulli Naive Bayes')
print('------------------------------')
model = BernoulliNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy : {}'.format(model.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Bernoulli Naive Bayes ------------------------------ Accuracy : 0.22150882825040127 MAE : 2.95 MSE : 23.42 RMSE : 4.839478 R2_SCORE : 0.758323
from sklearn.linear_model import LogisticRegression
highAcc =0
maxc = 1
for c in range(1,20):
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=c,
fit_intercept=True, intercept_scaling=1, class_weight=None,
random_state=None, solver='lbfgs', max_iter=100, multi_class='auto',
verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
model = model.fit(X_train,y_train)
#using the trained model on the test set
pred_y = model.predict(X_test)
if(highAcc < model.score(X_test, y_test)):
highAcc = model.score(X_test, y_test)
maxc = c
print("C = {} , Accuracy = {}".format(maxc, highAcc))
C = 14 , Accuracy = 0.38202247191011235
print('Logistic Regression , C=14')
print('------------------------------')
# initialising the classifier for c=14
model = LogisticRegression(C=14)
# applying the model for the test values
model = model.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = model.predict(X_test)
print('Accuracy : {}'.format(model.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Logistic Regression , C=14 ------------------------------ Accuracy : 0.38202247191011235 MAE : 1.16 MSE : 3.83 RMSE : 1.958232 R2_SCORE : 0.960430
from sklearn.neighbors import KNeighborsClassifier
print('KNeighborsClassifier')
print('------------------------------')
# checking the accuracy while looping throught the neighbors count from 5 to 10
highAcc =0
maxn=0
for n in range(5,20):
knn = KNeighborsClassifier(n_neighbors = n)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
if(highAcc < knn.score(X_test, y_test)):
highAcc = knn.score(X_test, y_test)
maxn= n
#print('n_neighbors {} -- Accuracy : {}'.format(n, knn.score(X_test, y_test) ))
print('n = {} , MaxAccuracy = {}'.format(maxn, highAcc ))
KNeighborsClassifier ------------------------------ n = 5 , MaxAccuracy = 0.8507223113964687
print('KNeighborsClassifier')
print('------------------------------')
# initialising the classifier for n=5
knn = KNeighborsClassifier(n_neighbors = 5)
# applying the model for the test values
knn.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = knn.predict(X_test)
print('Accuracy : {}'.format(knn.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
KNeighborsClassifier ------------------------------ Accuracy : 0.8507223113964687 MAE : 0.17 MSE : 0.21 RMSE : 0.458555 R2_SCORE : 0.997830
#SVC, NuSVC and LinearSVC are classes capable of performing binary and multi-class classification on a dataset.
#Importing the necessary packages and libaries
from sklearn import svm
print('SUPPORT VECTOR CLASSIFICATION ')
#print('------------------------------')
kernal = ["linear","rbf",'poly','sigmoid']
for k in kernal:
#print('Kernal - {}'.format(k))
print('------------------------------')
for g in ["auto" ,"scale"]:
highAcc = 0
Maxc=1
for c in range(1,15):
# higher value of c gives l2 penality --> overfitting
model = svm.SVC(C=c, kernel=k, gamma= g, decision_function_shape='ovo')
model = model.fit(X_train,y_train)
#using the trained model on the test set
pred_y = model.predict(X_test)
if highAcc < model.score(X_test,y_test):
highAcc = model.score(X_test,y_test)
Maxc = c
print("Kernal = {} , C = {} , gamma = {} - MaxAccuracy = {}".format(k,Maxc,g,highAcc))
#print('------------------------------')
SUPPORT VECTOR CLASSIFICATION ------------------------------ Kernal = linear , C = 10 , gamma = auto - MaxAccuracy = 0.6773675762439807 Kernal = linear , C = 10 , gamma = scale - MaxAccuracy = 0.6773675762439807 ------------------------------ Kernal = rbf , C = 14 , gamma = auto - MaxAccuracy = 0.8154093097913323 Kernal = rbf , C = 14 , gamma = scale - MaxAccuracy = 0.7479935794542536 ------------------------------ Kernal = poly , C = 14 , gamma = auto - MaxAccuracy = 0.7897271268057785 Kernal = poly , C = 14 , gamma = scale - MaxAccuracy = 0.6821829855537721 ------------------------------ Kernal = sigmoid , C = 4 , gamma = auto - MaxAccuracy = 0.22632423756019263 Kernal = sigmoid , C = 5 , gamma = scale - MaxAccuracy = 0.28892455858747995
print('Linear SVC')
print('------------------------------')
# initialising the classifier
linear = svm.SVC(kernel='linear', C=10, decision_function_shape='ovo')
# applying the model for the test values
linear.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = linear.predict(X_test)
print('Accuracy : {}'.format(linear.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Linear SVC ------------------------------ Accuracy : 0.6773675762439807 MAE : 0.4 MSE : 0.58 RMSE : 0.763325 R2_SCORE : 0.993987
print('Radial Basis Function - SVC')
print('------------------------------')
# initialising the classifier
rbf = svm.SVC(kernel='rbf', gamma="auto", C=14, decision_function_shape='ovo')
# applying the model for the test values
rbf.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = rbf.predict(X_test)
print('Accuracy : {}'.format(rbf.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Radial Basis Function - SVC ------------------------------ Accuracy : 0.8154093097913323 MAE : 0.21 MSE : 0.29 RMSE : 0.541978 R2_SCORE : 0.996969
# polynomial kernel function
poly = svm.SVC(kernel='poly', degree=3, C=14, gamma="auto", decision_function_shape='ovo').fit(X_train, y_train)
print('Polynomial Kernal Function - SVC')
print('------------------------------')
# predicting the out put values for test inputs
y_pred = rbf.predict(X_test)
print('Accuracy : {}'.format(poly.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Polynomial Kernal Function - SVC ------------------------------ Accuracy : 0.7897271268057785 MAE : 0.21 MSE : 0.29 RMSE : 0.541978 R2_SCORE : 0.996969
#Sigmoid
sig = svm.SVC(kernel='sigmoid', C=5, gamma="scale", decision_function_shape='ovo').fit(X_train, y_train)
print('Sigmoid Function - SVC')
print('------------------------------')
# predicting the out put values for test inputs
y_pred = rbf.predict(X_test)
print('Accuracy : {}'.format(sig.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
Sigmoid Function - SVC ------------------------------ Accuracy : 0.28892455858747995 MAE : 0.21 MSE : 0.29 RMSE : 0.541978 R2_SCORE : 0.996969
# To ignore warnings thrown by dataprep library
import warnings
warnings.filterwarnings('ignore')
# importing the dataprep library
import dataprep
from dataprep.eda import create_report
# generating the visual using create_report method
final_report = create_report(dataset, title='Final Dataset')
final_report
| Number of Variables | 19 |
|---|---|
| Number of Rows | 2074 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 324.1 KB |
| Average Row Size in Memory | 160.0 B |
| Numerical | 19 |
|---|
numerical
| Distinct Count | 46 |
|---|---|
| Unique (%) | 2.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 68.7083 |
| Minimum | 39 |
| Maximum | 84 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 39 |
|---|---|
| 5-th Percentile | 49.65 |
| Q1 | 62 |
| Median | 71 |
| Q3 | 76 |
| 95-th Percentile | 81 |
| Maximum | 84 |
| Range | 45 |
| IQR | 14 |
| Mean | 68.7083 |
|---|---|
| Standard Deviation | 10.0026 |
| Variance | 100.0523 |
| Sum | 142501 |
| Skewness | -0.7547 |
| Kurtosis | -0.3761 |
| Coefficient of Variation | 0.1456 |
numerical
| Distinct Count | 94 |
|---|---|
| Unique (%) | 4.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 24.8506 |
| Minimum | 19.8 |
| Maximum | 29.1 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19.8 |
|---|---|
| 5-th Percentile | 21.4 |
| Q1 | 23.2 |
| Median | 25.5 |
| Q3 | 26.3 |
| 95-th Percentile | 27.3 |
| Maximum | 29.1 |
| Range | 9.3 |
| IQR | 3.1 |
| Mean | 24.8506 |
|---|---|
| Standard Deviation | 1.9323 |
| Variance | 3.7337 |
| Sum | 51540.2 |
| Skewness | -0.5303 |
| Kurtosis | -0.7565 |
| Coefficient of Variation | 0.07776 |
numerical
| Distinct Count | 652 |
|---|---|
| Unique (%) | 31.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 35.8725 |
| Minimum | 9.3 |
| Maximum | 88.4 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 9.3 |
|---|---|
| 5-th Percentile | 11.7 |
| Q1 | 18.8 |
| Median | 29.25 |
| Q3 | 50.5 |
| 95-th Percentile | 77.7 |
| Maximum | 88.4 |
| Range | 79.1 |
| IQR | 31.7 |
| Mean | 35.8725 |
|---|---|
| Standard Deviation | 21.6236 |
| Variance | 467.5794 |
| Sum | 74399.5 |
| Skewness | 0.7696 |
| Kurtosis | -0.6317 |
| Coefficient of Variation | 0.6028 |
numerical
| Distinct Count | 1006 |
|---|---|
| Unique (%) | 48.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 5.2056 |
| Minimum | 0 |
| Maximum | 17.87 |
| Zeros | 25 |
| Zeros (%) | 1.2% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.19 |
| Q1 | 1.68 |
| Median | 4.485 |
| Q3 | 8.1 |
| 95-th Percentile | 12.1 |
| Maximum | 17.87 |
| Range | 17.87 |
| IQR | 6.42 |
| Mean | 5.2056 |
|---|---|
| Standard Deviation | 3.9504 |
| Variance | 15.6055 |
| Sum | 10796.33 |
| Skewness | 0.5139 |
| Kurtosis | -0.6963 |
| Coefficient of Variation | 0.7589 |
numerical
| Distinct Count | 219 |
|---|---|
| Unique (%) | 10.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 9416.6428 |
| Minimum | 100 |
| Maximum | 290000 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 100 |
|---|---|
| 5-th Percentile | 100 |
| Q1 | 500 |
| Median | 3800 |
| Q3 | 9416.6428 |
| 95-th Percentile | 42350 |
| Maximum | 290000 |
| Range | 289900 |
| IQR | 8916.6428 |
| Mean | 9416.6428 |
|---|---|
| Standard Deviation | 22287.9745 |
| Variance | 4.9675e+08 |
| Sum | 1.953e+07 |
| Skewness | 6.8164 |
| Kurtosis | 62.0487 |
| Coefficient of Variation | 2.3669 |
numerical
| Distinct Count | 445 |
|---|---|
| Unique (%) | 21.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 193.2985 |
| Minimum | 49 |
| Maximum | 683 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 49 |
|---|---|
| 5-th Percentile | 62.65 |
| Q1 | 101 |
| Median | 161 |
| Q3 | 255 |
| 95-th Percentile | 445.35 |
| Maximum | 683 |
| Range | 634 |
| IQR | 154 |
| Mean | 193.2985 |
|---|---|
| Standard Deviation | 119.867 |
| Variance | 14368.0976 |
| Sum | 400901 |
| Skewness | 1.2728 |
| Kurtosis | 1.4242 |
| Coefficient of Variation | 0.6201 |
numerical
| Distinct Count | 124 |
|---|---|
| Unique (%) | 6.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 75008.4851 |
| Minimum | 18 |
| Maximum | 2.0254e+06 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 18 |
|---|---|
| 5-th Percentile | 101 |
| Q1 | 979 |
| Median | 6704 |
| Q3 | 52218 |
| 95-th Percentile | 219826 |
| Maximum | 2.0254e+06 |
| Range | 2.0254e+06 |
| IQR | 51239 |
| Mean | 75008.4851 |
|---|---|
| Standard Deviation | 254538.4035 |
| Variance | 6.479e+10 |
| Sum | 1.5557e+08 |
| Skewness | 6.1166 |
| Kurtosis | 39.4687 |
| Coefficient of Variation | 3.3935 |
numerical
| Distinct Count | 1105 |
|---|---|
| Unique (%) | 53.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.6446 |
| Minimum | 0.01 |
| Maximum | 22.35 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.01 |
|---|---|
| 5-th Percentile | 0.46 |
| Q1 | 1.75 |
| Median | 6.13 |
| Q3 | 10.18 |
| 95-th Percentile | 15.721 |
| Maximum | 22.35 |
| Range | 22.34 |
| IQR | 8.43 |
| Mean | 6.6446 |
|---|---|
| Standard Deviation | 5.0699 |
| Variance | 25.7043 |
| Sum | 13780.87 |
| Skewness | 0.5265 |
| Kurtosis | -0.6652 |
| Coefficient of Variation | 0.763 |
numerical
| Distinct Count | 1381 |
|---|---|
| Unique (%) | 66.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 11.6437 |
| Minimum | 0.26 |
| Maximum | 59.09 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.26 |
|---|---|
| 5-th Percentile | 1.5795 |
| Q1 | 4.99 |
| Median | 8.325 |
| Q3 | 17.35 |
| 95-th Percentile | 28.1235 |
| Maximum | 59.09 |
| Range | 58.83 |
| IQR | 12.36 |
| Mean | 11.6437 |
|---|---|
| Standard Deviation | 9.2928 |
| Variance | 86.3565 |
| Sum | 24149.09 |
| Skewness | 1.5618 |
| Kurtosis | 3.4582 |
| Coefficient of Variation | 0.7981 |
numerical
| Distinct Count | 1307 |
|---|---|
| Unique (%) | 63.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.8389 |
| Minimum | 0 |
| Maximum | 64.24 |
| Zeros | 72 |
| Zeros (%) | 3.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.01 |
| Q1 | 1.145 |
| Median | 5.61 |
| Q3 | 22.34 |
| 95-th Percentile | 42.522 |
| Maximum | 64.24 |
| Range | 64.24 |
| IQR | 21.195 |
| Mean | 12.8389 |
|---|---|
| Standard Deviation | 14.9566 |
| Variance | 223.7009 |
| Sum | 26627.93 |
| Skewness | 1.1647 |
| Kurtosis | 0.2275 |
| Coefficient of Variation | 1.1649 |
numerical
| Distinct Count | 1522 |
|---|---|
| Unique (%) | 73.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 15.5803 |
| Minimum | 0.05 |
| Maximum | 72.74 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.05 |
|---|---|
| 5-th Percentile | 0.8595 |
| Q1 | 4.0025 |
| Median | 13.63 |
| Q3 | 23.72 |
| 95-th Percentile | 38.1445 |
| Maximum | 72.74 |
| Range | 72.69 |
| IQR | 19.7175 |
| Mean | 15.5803 |
|---|---|
| Standard Deviation | 12.8611 |
| Variance | 165.4069 |
| Sum | 32313.6 |
| Skewness | 1.0476 |
| Kurtosis | 1.3842 |
| Coefficient of Variation | 0.8255 |
numerical
| Distinct Count | 1980 |
|---|---|
| Unique (%) | 95.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 116.448 |
| Minimum | 1.02 |
| Maximum | 463.91 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.02 |
|---|---|
| 5-th Percentile | 6.624 |
| Q1 | 31.9475 |
| Median | 98.2 |
| Q3 | 178.51 |
| 95-th Percentile | 291.8985 |
| Maximum | 463.91 |
| Range | 462.89 |
| IQR | 146.5625 |
| Mean | 116.448 |
|---|---|
| Standard Deviation | 94.9286 |
| Variance | 9011.434 |
| Sum | 241513.11 |
| Skewness | 0.7646 |
| Kurtosis | -0.237 |
| Coefficient of Variation | 0.8152 |
numerical
| Distinct Count | 1577 |
|---|---|
| Unique (%) | 76.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 17.395 |
| Minimum | 0.07 |
| Maximum | 191.75 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.07 |
|---|---|
| 5-th Percentile | 1.3165 |
| Q1 | 5.1825 |
| Median | 12.16 |
| Q3 | 24 |
| 95-th Percentile | 47.607 |
| Maximum | 191.75 |
| Range | 191.68 |
| IQR | 18.8175 |
| Mean | 17.395 |
|---|---|
| Standard Deviation | 19.817 |
| Variance | 392.7147 |
| Sum | 36077.21 |
| Skewness | 3.9626 |
| Kurtosis | 25.1503 |
| Coefficient of Variation | 1.1392 |
numerical
| Distinct Count | 2074 |
|---|---|
| Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.3738 |
| Minimum | 1.7014 |
| Maximum | 20.4134 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.7014 |
|---|---|
| 5-th Percentile | 2.8371 |
| Q1 | 4.4335 |
| Median | 6.1969 |
| Q3 | 8.1496 |
| 95-th Percentile | 10.3428 |
| Maximum | 20.4134 |
| Range | 18.712 |
| IQR | 3.7161 |
| Mean | 6.3738 |
|---|---|
| Standard Deviation | 2.3791 |
| Variance | 5.6603 |
| Sum | 13219.2983 |
| Skewness | 0.3744 |
| Kurtosis | 0.2049 |
| Coefficient of Variation | 0.3733 |
numerical
| Distinct Count | 78 |
|---|---|
| Unique (%) | 3.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.6432 |
| Minimum | 19 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19 |
|---|---|
| 5-th Percentile | 51 |
| Q1 | 83 |
| Median | 92 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 80 |
| IQR | 14 |
| Mean | 86.6432 |
|---|---|
| Standard Deviation | 14.9612 |
| Variance | 223.8369 |
| Sum | 179698 |
| Skewness | -1.9149 |
| Kurtosis | 3.5448 |
| Coefficient of Variation | 0.1727 |
numerical
| Distinct Count | 362 |
|---|---|
| Unique (%) | 17.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.6951 |
| Minimum | 0.1 |
| Maximum | 116.2 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.1 |
|---|---|
| 5-th Percentile | 2.9 |
| Q1 | 6 |
| Median | 10.4 |
| Q3 | 15.4 |
| 95-th Percentile | 30.27 |
| Maximum | 116.2 |
| Range | 116.1 |
| IQR | 9.4 |
| Mean | 12.6951 |
|---|---|
| Standard Deviation | 10.8287 |
| Variance | 117.2611 |
| Sum | 26329.6 |
| Skewness | 3.6664 |
| Kurtosis | 22.6907 |
| Coefficient of Variation | 0.853 |
numerical
| Distinct Count | 1796 |
|---|---|
| Unique (%) | 86.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 595.0553 |
| Minimum | 240.4 |
| Maximum | 1317.7 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 240.4 |
|---|---|
| 5-th Percentile | 328.125 |
| Q1 | 437.325 |
| Median | 588.35 |
| Q3 | 710.125 |
| 95-th Percentile | 935.445 |
| Maximum | 1317.7 |
| Range | 1077.3 |
| IQR | 272.8 |
| Mean | 595.0553 |
|---|---|
| Standard Deviation | 194.1573 |
| Variance | 37697.0431 |
| Sum | 1.2341e+06 |
| Skewness | 0.644 |
| Kurtosis | 0.357 |
| Coefficient of Variation | 0.3263 |
numerical
| Distinct Count | 83 |
|---|---|
| Unique (%) | 4.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 1.4701 |
| Minimum | 0 |
| Maximum | 9.2 |
| Zeros | 31 |
| Zeros (%) | 1.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.1 |
| Q1 | 0.3 |
| Median | 0.7 |
| Q3 | 2.5 |
| 95-th Percentile | 4.4 |
| Maximum | 9.2 |
| Range | 9.2 |
| IQR | 2.2 |
| Mean | 1.4701 |
|---|---|
| Standard Deviation | 1.5467 |
| Variance | 2.3922 |
| Sum | 3048.9 |
| Skewness | 1.469 |
| Kurtosis | 2.2657 |
| Coefficient of Variation | 1.0521 |
numerical
| Distinct Count | 74 |
|---|---|
| Unique (%) | 3.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.7208 |
| Minimum | 8 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 8 |
|---|---|
| 5-th Percentile | 53 |
| Q1 | 82 |
| Median | 93 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 91 |
| IQR | 15 |
| Mean | 86.7208 |
|---|---|
| Standard Deviation | 14.6434 |
| Variance | 214.43 |
| Sum | 179859 |
| Skewness | -1.8239 |
| Kurtosis | 3.3041 |
| Coefficient of Variation | 0.1689 |
#OBSERVATIONS:
#1. lONGER LOGEVITY --> Higher Egg and Meat Consumption ( fish and sea food - iceland , maldives
# Beef/ Bovine meat - Argentina )
#2. No effect of alcohol on logevity
#3. Poultry and Goat meat has no effect too.
#4. Higher Milk consumption shows higher longevity
# MODEL Accuracy MAE MSE RMSE R2_Score
# Gaussian Naive Bayes 31% 1.63 5.96 2.440956 0.938516
# Bernoulli Naive Bayes 22% 2.95 23.42 4.839478 0.758323
# Logistic Regression 38% 1.16 3.83 1.958232 0.96043
# KNN Classifier 85% 0.17 0.21 0.458555 0.99783
# Linear SVC 68% 0.4 0.58 0.763325 0.993987
# Radial Basis Function SVC 82% 0.21 0.29 0.541978 0.996969
# Polynomial SVC 79% 0.21 0.29 0.541978 0.996969
# Sigmoid SVC 29% 0.21 0.29 0.541978 0.996969